Rows: 17379 Columns: 14
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (1): Date
dbl (13): Season, Hour, Holiday, Day of the Week, Working Day, Weather Type,...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
plotts.sample.wge(bike_data$`Total Users`)$xbar
[1] 189.4631
# make the bike data daily instead of hourly daily_bike_data <- bike_data %>%group_by(Date =as.Date(Date, format ="%m/%d/%Y")) %>%summarise(Season =mean(Season),Hour =mean(Hour),Holiday =mean(Holiday),Day_of_the_Week =mean(`Day of the Week`),Working_Day =mean(`Working Day`),Weather_Type =mean(`Weather Type`),Temperature =mean(`Temperature F`),Temperature_Feels =mean(`Temperature Feels F`),Humidity =mean(Humidity),Wind_Speed =mean(`Wind Speed`),Casual_Users =sum(`Casual Users`),Registered_Users =sum(`Registered Users`),Total_Users =sum(`Total Users`), )plotts.sample.wge(daily_bike_data$Total_Users)$xbar
[1] 4504.349
Models
Univariate
ARMA Model
# create model plotts.sample.wge(daily_bike_data$Total_Users)$xbar # looking at the spectral density it looks like there may be wandering behavior
# plot of Last N forecasts for short and long term horizon arma_stfore1 =fore.arima.wge(daily_bike_data$Total_Users, phi = arma_est$phi, theta = arma_est$theta, n.ahead =7, lastn =TRUE)
t =1:length(daily_bike_data$Total_Users)plot(t[720:731],daily_bike_data$Total_Users[720:731], type ='l', xlab ="Time", ylab ="Total Users", main ='Last 7 Forecasts')points(t[725:731], arma_stfore1$f, type="l", lwd=2, lty =2, col ='blue')
plot(t[670:731],daily_bike_data$Total_Users[670:731], type ='l', xlab ="Time", ylab ="Total Users", main ='Last 60 Forecasts')points(t[672:731], arma_ltfore1$f, type="l", lwd=2, lty =2,col ='blue')
# ASE arma_stASE =mean((daily_bike_data$Total_Users[725:731]-arma_stfore1$f)^2)arma_ltASE =mean((daily_bike_data$Total_Users[672:731]-arma_ltfore1$f)^2)arma_stASE
[1] 3358954
arma_ltASE
[1] 3058217
# Rolling Window RMSE# RW-RMSE commented out due to obscene amount of unsupressable output# arma_strwRMSE = roll.win.rmse.wge(daily_bike_data$Total_Users, horizon = 7, phi = arma_est$phi, theta = arma_est$theta)$rwRMSE# arma_ltrwRMSE = roll.win.rmse.wge(daily_bike_data$Total_Users, horizon = 60, phi = arma_est$phi, theta = arma_est$theta)$rwRMSEarma_strwRMSE =991.812arma_ltrwRMSE =1224.59arma_stfore2 =fore.arima.wge(daily_bike_data$Total_Users, phi = arma_est$phi, theta = arma_est$theta, n.ahead =7)
# Plots of the short and Long Term Forecasts t =1:800plot(t[670:731],daily_bike_data$Total_Users[670:731], type ='l', xlim =c(670,745), xlab ="Time", ylab ="Total Users", main ='7 Day Forecast ARMA(4,1)')points(t[732:738],arma_stfore2$f, type ='l', col ='blue')points(t[732:738],arma_stfore2$ll, type ='l',lwd=2, lty =2, col ='red')points(t[732:738],arma_stfore2$ul, type ='l',lwd=2, lty =2, col ='red')
plot(t[670:731],daily_bike_data$Total_Users[670:731], type ='l', xlim =c(670,795), xlab ="Time", ylab ="Total Users", main ='60 Day Forecast ARMA(4,1)')points(t[732:791],arma_ltfore2$f, type ='l', col ='blue')points(t[732:791],arma_ltfore2$ll, type ='l',lwd=2, lty =2, col ='red')points(t[732:791],arma_ltfore2$ul, type ='l',lwd=2, lty =2, col ='red')
Non-Seasonal ARIMA Model
# Check if the data could be stationaryadf.test(daily_bike_data$Total_Users) # p-value 0.7, data not likely stationary
Augmented Dickey-Fuller Test
data: daily_bike_data$Total_Users
Dickey-Fuller = -1.6351, Lag order = 9, p-value = 0.7327
alternative hypothesis: stationary
# Difference of 1total_d1 =artrans.wge(daily_bike_data$Total_Users, phi.tr =c(1))
# Model the residualsaic5.wge(total_d1, type ='bic') # bic and aic pick p = 1 and q = 1
---------WORKING... PLEASE WAIT...
Five Smallest Values of bic
p q bic
1 1 13.68620
0 2 13.69304
2 1 13.69387
1 2 13.69455
3 1 13.69895
est =est.arma.wge(total_d1, p =1, q =1)
Coefficients of AR polynomial:
0.3591
AR Factor Table
Factor Roots Abs Recip System Freq
1-0.3591B 2.7848 0.3591 0.0000
Coefficients of MA polynomial:
0.8903
MA FACTOR TABLE
Factor Roots Abs Recip System Freq
1-0.8903B 1.1232 0.8903 0.0000
plotts.sample.wge(est$res, arlimits =TRUE)$xbar # appears to be white noise
# Generate realizations for comparisonarima_gen1 =gen.arima.wge(200, phi = est$phi, theta = est$theta, d =1)
arima_gen2 =gen.arima.wge(200, phi = est$phi, theta = est$theta, d =1)
arima_gen3 =gen.arima.wge(200, phi = est$phi, theta = est$theta, d =1)
arima_gen4 =gen.arima.wge(200, phi = est$phi, theta = est$theta, d =1)
plotts.sample.wge(arima_gen1)$xbar
[1] 7.816847
plotts.sample.wge(arima_gen2)$xbar
[1] 1.375558
plotts.sample.wge(arima_gen3)$xbar
[1] 2.152792
plotts.sample.wge(arima_gen4)$xbar # Similar behavior to original data
[1] 3.510853
# Compare Spectral Densities: sims =20SpecDen =parzen.wge(daily_bike_data$Total_Users, plot ="FALSE")plot(SpecDen$freq,SpecDen$pzgram, type ="l", lwd =6)for( i in1: sims){ SpecDen2 =parzen.wge(gen.aruma.wge(length(daily_bike_data$Total_Users), d =1, phi = est$phi, theta = est$theta, plot ="FALSE"), plot ="FALSE")lines(SpecDen2$freq,SpecDen2$pzgram, lwd =2, col ="red")}
# Compare ACFs:sims =20ACF =acf(daily_bike_data$Total_Users, plot ="FALSE")plot(ACF$lag ,ACF$acf , type ="l", lwd =6)for( i in1: sims){ ACF2 =acf(gen.aruma.wge(length(daily_bike_data$Total_Users), d =1, phi = est$phi, theta = est$theta, plot ="FALSE"), plot ="FALSE")lines(ACF2$lag ,ACF2$acf, lwd =2, col ="red")}
# Short and Long Term Forecastsfore_arima_st =fore.arima.wge(daily_bike_data$Total_Users, phi = est$phi, theta = est$theta, d =1, n.ahead =7, lastn =TRUE)
# ASE arima_stASE =mean((daily_bike_data$Total_Users[725:731]-fore_arima_st$f)^2)arima_ltASE =mean((daily_bike_data$Total_Users[672:731]-fore_arima_lt$f)^2)arima_stASE
[1] 3230238
arima_ltASE
[1] 3930824
# Rolling Window RMSE# RW-RMSE commented out due to obscene amount of unsupressable output# arima_strwRMSE = roll.win.rmse.wge(daily_bike_data$Total_Users, horizon = 7, phi = est$phi, theta = est$theta, d = 1)$rwRMSE# arima_ltrwRMSE = roll.win.rmse.wge(daily_bike_data$Total_Users, horizon = 60, phi = est$phi, theta = est$theta, d = 1)$rwRMSEarima_strwRMSE =1237.393arima_ltrwRMSE =1503.197# Plots of the short and Long Term Forecasts fore_arima_st_2 =fore.arima.wge(daily_bike_data$Total_Users, phi = est$phi, theta = est$theta, d =1, n.ahead =7)
t =1:800plot(t[670:731],daily_bike_data$Total_Users[670:731], type ='l', main ="Short Term Forecast", xlim =c(670,795), xlab ="Time", ylab ="Total Users")points(t[732:738],fore_arima_st_2$f, type ='l', col ='blue')points(t[732:738],fore_arima_st_2$ll, type ='l',lwd=2, lty =2, col ='red')points(t[732:738],fore_arima_st_2$ul, type ='l',lwd=2, lty =2, col ='red')
plot(t[670:731],daily_bike_data$Total_Users[670:731], type ='l', main ="Long Term Forecast", xlim =c(670,795), xlab ="Time", ylab ="Total Users")points(t[732:791],fore_arima_lt_2$f, type ='l', col ='blue')points(t[732:791],fore_arima_lt_2$ll, type ='l',lwd=2, lty =2, col ='red')points(t[732:791],fore_arima_lt_2$ul, type ='l',lwd=2, lty =2, col ='red')
The models in factored form or at least separate the stationary and non- stationary factors with standard deviation or variance of the white noise.
AIC
ASE (short and long term forecasts)
Rolling Window RMSE (short and long term forecasts)
At least 10 superimposed spectral densities from 10 generated realizations like we did in Unit 11. Use these to help choose between the at least two candidate univariate models.
Visualization of Forecasts for both the short- and long-term Horizons.
Be sure and include confidence intervals when possible (I don’t have code for confidence intervals from MLP models at the moment… but that would be a good thing to work on! )
Multivariate
Include an ASE (rolling window is not yet available in multivariate models)
Short Horizon (you pick the length.. could be one step ahead)
Long Horizon (you pick … just must be longer than the short horizon.)
Describe the explanatory variable(s) used in the model and why you felt they were significant / important.
Visualization of Forecasts for both the short- and long-Horizons.
Be sure and include confidence intervals if using VAR …
MLR With Correlated Errors
ccf(daily_bike_data$Total_Users,daily_bike_data$Humidity) # no lag
daily_bike_data$Humidity_lag = dplyr::lag(daily_bike_data$Humidity,4)# Short Term Prediction Fitfit1 =lm(Total_Users~Humidity + Temperature + Day_of_the_Week + Wind_Speed, data = daily_bike_data[1:724,])plotts.sample.wge(fit1$residuals)$xbar
[1] 3.850278e-13
aic5.wge(fit1$residuals, type ='bic')
---------WORKING... PLEASE WAIT...
Five Smallest Values of bic
p q bic
2 1 13.33609
3 1 13.34050
4 1 13.34482
3 2 13.34737
5 1 13.35261
aic5.wge(fit1$residuals, type ='aic')
---------WORKING... PLEASE WAIT...
Five Smallest Values of aic
p q aic
4 1 13.30683
5 1 13.30828
3 1 13.30884
3 2 13.30937
2 1 13.31076
mlr1 =arima(daily_bike_data$Total_Users[1:724], order =c(2,0,1),xreg =cbind(daily_bike_data$Humidity[1:724], daily_bike_data$Temperature[1:724], daily_bike_data$Day_of_the_Week[1:724], daily_bike_data$Wind_Speed[1:724]))plotts.wge(mlr1$residuals) # looks random
acf(mlr1$residuals) # none of the acfs out of bounds
# Long Term Prediction Fitfit2 =lm(Total_Users~Humidity + Temperature + Day_of_the_Week + Wind_Speed, data = daily_bike_data[1:671,])plotts.sample.wge(fit2$residuals)$xbar
[1] -3.659666e-13
aic5.wge(fit2$residuals, type ='bic')
---------WORKING... PLEASE WAIT...
Five Smallest Values of bic
p q bic
2 1 13.31773
3 1 13.32523
4 1 13.32670
1 2 13.33071
3 2 13.33144
mlr2 =arima(daily_bike_data$Total_Users[1:671], order =c(2,0,1),xreg =cbind(daily_bike_data$Humidity[1:671], daily_bike_data$Temperature[1:671], daily_bike_data$Day_of_the_Week[1:671], daily_bike_data$Wind_Speed[1:671]))plotts.wge(mlr2$residuals) # looks random
acf(mlr2$residuals[-(1:5)]) # 0/20 acfs out of bounds
aic5.wge(tempdiff, type ='bic') # bic picked p = 1 and q = 1
---------WORKING... PLEASE WAIT...
Five Smallest Values of bic
p q bic
1 1 3.714381
2 0 3.720397
3 0 3.722154
0 2 3.723965
1 2 3.726426
temp =est.arma.wge(tempdiff, p =1, q =1)
Coefficients of AR polynomial:
0.4839
AR Factor Table
Factor Roots Abs Recip System Freq
1-0.4839B 2.0667 0.4839 0.0000
Coefficients of MA polynomial:
-0.3906
MA FACTOR TABLE
Factor Roots Abs Recip System Freq
1+0.3906B -2.5605 0.3906 0.5000
acf(temp$res)
ljung.wge(temp$res) # close to 0.05 but Fail to reject
# Year Out Forecastmlr_lt_pred3 =predict(mlr_forecast, newxreg = ahead[1:365,], n.ahead =365, lastn=FALSE)plot(seq(1,731,1),daily_bike_data$Total_Users,type ='l',xlim =c(1,1100), main ="365 day MLR Forecast", xlab ='Time', ylab ='Total Users')points(seq(732,1096,1),mlr_lt_pred3$pred,type ='l', pch =1,col ='blue',lwd=1, lty =1)
MLP Model
ASE (short and long term forecasts)
Rolling Window RMSE (short and long term forecasts (only if univariate)
Visualization of Forecasts for both the short- and long-term Horizons.
Confidence / Prediction intervals are not required (I don’t have code for confidence / prediction intervals (bootstrap intervals) for MLP models at the moment… but that would be a good thing to work on! )
# Convert tibble to a data frame with ts() objectsbike_DF <- daily_bike_data %>% dplyr::select(-c(Date, Hour, Holiday, Temperature_Feels, Casual_Users, Registered_Users)) %>%# Remove unnecessary columnsmutate(across(everything(), ~zoo(.x, order.by = daily_bike_data$Date))) %>%# Convert each column to a zoo objectmutate(across(everything(), ~as.ts(.x))) %>%# Convert zoo objects to ts objectsas.data.frame() # Convert the tibble to a data framebikeShortTrain = bike_DF[1:724,]bikeShortTest = bike_DF[725:731,]bikeLongTrain = bike_DF[1:671,]bikeLongTest = bike_DF[672:731,]seed =137
# Forecast Short-term predictor variables using MLPset.seed(seed)fit.mlp.short.Season =mlp(ts(bikeShortTrain[,"Season"]),reps =10, difforder =0, comb ="mean", det.type ="bin")fore.mlp.short.Season =forecast(fit.mlp.short.Season, h =7)plot(fore.mlp.short.Season)
fit.mlp.short.Day_of_the_Week =mlp(ts(bikeShortTrain[,"Day_of_the_Week"]),reps =10, difforder =0, comb ="mean", det.type ="bin")fore.mlp.short.Day_of_the_Week =forecast(fit.mlp.short.Day_of_the_Week, h =7)fore.mlp.short.Day_of_the_Week$mean =round(fore.mlp.short.Day_of_the_Week$mean) # Round to nearest whole numberplot(fore.mlp.short.Day_of_the_Week)
fit.mlp.short.Working_Day =mlp(ts(bikeShortTrain[,"Working_Day"]),reps =10, difforder =0, comb ="mean", det.type ="bin")fore.mlp.short.Working_Day =forecast(fit.mlp.short.Working_Day, h =7)fore.mlp.short.Working_Day$mean =round(fore.mlp.short.Working_Day$mean) # Round to nearest whole numberplot(fore.mlp.short.Working_Day)
# Forecast and evaluate ASE for short and longfore.mlp.st =forecast(fit.mlp.st, h =7, xreg = BDF_fore_short)fore.mlp.lt =forecast(fit.mlp.lt, h =60, xreg = BDF_fore_long)ASE.mlp.st =mean((bikeShortTest$Total_Users - fore.mlp.st$mean)^2)ASE.mlp.lt =mean((bikeLongTest$Total_Users - fore.mlp.lt$mean)^2)print(paste("MLP ASE Short Term: ", round(ASE.mlp.st, 2)))
[1] "MLP ASE Short Term: 4177344.82"
print(paste("MLP ASE Long Term: ", round(ASE.mlp.lt, 2)))
[1] "MLP ASE Long Term: 10000456.14"
plot(fore.mlp.st)
plot(fore.mlp.lt)
# Plot the forecastst =1:731plot(t[720:731],bike_DF$Total_Users[720:731], type ='l', xlab ="Time", ylab ="Total Users", main ="MLP Short Term Forecast")lines(t[725:731], fore.mlp.st$mean, type="l", lwd=2, lty =2, col ='blue')
plot(t[600:731],bike_DF$Total_Users[600:731], type ='l', xlab ="Time", ylab ="Total Users", main ="MLP Long Term Forecast")lines(t[672:731], fore.mlp.lt$mean, type="l", lwd=2, lty =2, col ='blue')
Warning in `+.default`(arma_stfore1$f, mlr_lt_pred$pred): longer object length
is not a multiple of shorter object length
plot(seq(1,731,1),daily_bike_data$Total_Users,type ='l',xlim =c(720,731), xlab='Time', main ='Last 7 Day Forecast', ylab ='Total Users')points(seq(725,731,1),ensemble_st,type ='l', col ='blue')
plot(seq(1,731,1),daily_bike_data$Total_Users,type ='l',xlim =c(650,731), xlab='Time', main ='Last 60 Day Forecast', ylab ='Total Users')points(seq(672,731,1),ensemble_lt,type ='l', col ='blue')